SC RNA-Seq: Checking for duplicates with dupRadar

Libraries required

library(dupRadar)
library(biomaRt)
library(parallel)
library(viridis)
library(dplyr)

Bam files (marked duplicates)

bamFiles <- list.files(path = "./input", pattern = ".bam$", full.names = T)
names(bamFiles) <- as.character(unlist(sapply(bamFiles, function(x) strsplit(x, "\\.|\\/")[[1]][4])))

Analyze duplicates

dm <- NULL
if (!file.exists("./output/analyzedDuplicates.rds")) {
    dm <- lapply(bamFiles, function(x) {
        a <- analyzeDuprates(bam = x, gtf = "./input/gencode.vM18.chr_patch_hapl_scaff.annotation.gff3", 
            stranded = 0, paired = F, threads = detectCores() - 1)

        # removing versions from gene IDs
        a$ID <- as.character(sapply(as.character(a$ID), function(y) strsplit(y, "\\.")[[1]][1]))
        return(a)
    })
    saveRDS(object = dm, file = "./output/analyzedDuplicates.rds")
} else {
    dm <- readRDS("./output/analyzedDuplicates.rds")
}

Plotting and interpretation

The number of reads per base assigned to a gene in an ideal RNA-Seq data set is expected to be proportional to the abundance of its transcripts in the sample. For lowly expressed genes we expect read duplication to happen rarely by chance, while for highly expressed genes - depending on the total sequencing depth - we expect read duplication to happen often.

A good way to learn if a dataset is following this trend is by relating the normalized number of counts per gene (RPK, as a quantification of the gene expression) and the fraction represented by duplicated reads.

A duprate plot (blue cloud)

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    duprateExpDensPlot(DupMat = dm[[i]], pal = viridis(n = 1000), main = name)
    cat("\n \n")
}

PND15_11_B1

PND15_12_B1

PND15_18_B2

PND15_20_B2

PND15_21_B2

PND15_22_B2

PND8_09_B1

PND8_10_B1

PND8_13_B2

PND8_14_B2

PND8_16_B2

PND8_17_B2

PNW21_01_B1

PNW21_02_B1

PNW21_03_B1

PNW21_04_B1

PNW21_05_B1

PNW21_07_B1

Duprate Boxplot

The duprateExpBoxplot plot shows the range of the duplication rates at 5% bins (default) along the distribution of RPK gene counts. The x-axis displays the quantile of the RPK distribution, and the average RPK of the genes contained in this quantile.

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    duprateExpBoxplot(DupMat = dm[[i]], main = name)
    cat("\n \n")
}

PND15_11_B1

PND15_12_B1

PND15_18_B2

PND15_20_B2

PND15_21_B2

PND15_22_B2

PND8_09_B1

PND8_10_B1

PND8_13_B2

PND8_14_B2

PND8_16_B2

PND8_17_B2

PNW21_01_B1

PNW21_02_B1

PNW21_03_B1

PNW21_04_B1

PNW21_05_B1

PNW21_07_B1

Read counts expression

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    readcountExpBoxplot(DupMat = dm[[i]])
    cat("\n \n")
}

PND15_11_B1

PND15_12_B1

PND15_18_B2

PND15_20_B2

PND15_21_B2

PND15_22_B2

PND8_09_B1

PND8_10_B1

PND8_13_B2

PND8_14_B2

PND8_16_B2

PND8_17_B2

PNW21_01_B1

PNW21_02_B1

PNW21_03_B1

PNW21_04_B1

PNW21_05_B1

PNW21_07_B1

Read counts expression histogram

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    expressionHist(DupMat = dm[[i]])
    cat("\n \n")
}

PND15_11_B1

PND15_12_B1

PND15_18_B2

PND15_20_B2

PND15_21_B2

PND15_22_B2

PND8_09_B1

PND8_10_B1

PND8_13_B2

PND8_14_B2

PND8_16_B2

PND8_17_B2

PNW21_01_B1

PNW21_02_B1

PNW21_03_B1

PNW21_04_B1

PNW21_05_B1

PNW21_07_B1

Comparison of multi-mapping RPK and uniquely-mapping RPK

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    plot(log2(dm[[i]]$RPK), log2(dm[[i]]$RPKMulti), xlab = "Reads per kb (uniquely mapping reads only)", 
        ylab = "Reads per kb (all including multimappers, non-weighted)")
    cat("\n \n")
}

PND15_11_B1

PND15_12_B1

PND15_18_B2

PND15_20_B2

PND15_21_B2

PND15_22_B2

PND8_09_B1

PND8_10_B1

PND8_13_B2

PND8_14_B2

PND8_16_B2

PND8_17_B2

PNW21_01_B1

PNW21_02_B1

PNW21_03_B1

PNW21_04_B1

PNW21_05_B1

PNW21_07_B1

Connection between possible PCR artefacts and GC content

## set up biomart connection for mouse (needs internet connection)
ensm <- useMart("ensembl")
ensm <- useDataset("mmusculus_gene_ensembl", mart = ensm)

## get a table which has the gene GC content for the IDs that have been used to
## generate the table
tr <- getBM(attributes = c("ensembl_gene_id", "percentage_gene_gc_content"), values = TRUE, 
    mart = ensm)

## create a GC vector with IDs as element names
mgi.gc <- tr$percentage_gene_gc_content
names(mgi.gc) <- tr$ensembl_gene_id

Check distribution of annotated gene GC content (in %)

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")

    ## using dm duplication matrix that comes with the package add GC content to our
    ## demo data and keep only subset for which we can retrieve data
    keep <- dm[[i]]$ID %in% tr$ensembl_gene_id
    dm.gc <- dm[[i]][keep, ]
    dm.gc$gc <- mgi.gc[dm.gc$ID]

    boxplot(dm.gc$gc, main = "Gene GC content", ylab = "% GC")
    cat("\n \n")
}

PND15_11_B1

PND15_12_B1

PND15_18_B2

PND15_20_B2

PND15_21_B2

PND15_22_B2

PND8_09_B1

PND8_10_B1

PND8_13_B2

PND8_14_B2

PND8_16_B2

PND8_17_B2

PNW21_01_B1

PNW21_02_B1

PNW21_03_B1

PNW21_04_B1

PNW21_05_B1

PNW21_07_B1

Compare the dependence of duplication rate on expression level independently for below and above median GC genes

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")

    keep <- dm[[i]]$ID %in% tr$ensembl_gene_id
    dm.gc <- dm[[i]][keep, ]
    dm.gc$gc <- mgi.gc[dm.gc$ID]

    par(mfrow = c(1, 2))

    ## below median GC genes
    duprateExpDensPlot(dm.gc[dm.gc$gc <= 45, ], main = "below median GC genes")

    ## above median GC genes
    duprateExpDensPlot(dm.gc[dm.gc$gc >= 45, ], main = "above median GC genes")
    cat("\n \n")
}

PND15_11_B1

PND15_12_B1

PND15_18_B2

PND15_20_B2

PND15_21_B2

PND15_22_B2

PND8_09_B1

PND8_10_B1

PND8_13_B2

PND8_14_B2

PND8_16_B2

PND8_17_B2

PNW21_01_B1

PNW21_02_B1

PNW21_03_B1

PNW21_04_B1

PNW21_05_B1

PNW21_07_B1

References

report::cite_packages(sessionInfo())
  - Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2021). dplyr: A Grammar of Data Manipulation. R package version 1.0.5. https://CRAN.R-project.org/package=dplyr
  - Julien Barnier (2021). rmdformats: HTML Output Formats and Templates for 'rmarkdown' Documents. R package version 1.0.1. https://CRAN.R-project.org/package=rmdformats
  - Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt. Steffen Durinck, Paul T. Spellman, Ewan Birney and Wolfgang Huber, Nature Protocols 4, 1184-1191 (2009).
  - R Core Team (2021). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
  - Sergi Sayols, Denise Scherzinger and Holger Klein (2016): dupRadar: a Bioconductor package for the assessment of PCR artifacts in RNA-Seq data. BMC Bioinformatics, 17:428, doi:10.1186/s12859-016-1276-2
  - Simon Garnier, Noam Ross, Robert Rudis, Antônio P. Camargo, Marco Sciaini, and Cédric Scherer (2021). Rvision - Colorblind-Friendly Color Maps for R. R package version 0.4.0.
  - Simon Garnier, Noam Ross, Robert Rudis, Antônio P. Camargo, Marco Sciaini, and Cédric Scherer (2021). Rvision - Colorblind-Friendly Color Maps for R. R package version 0.6.0.
  - Yihui Xie (2021). knitr: A General-Purpose Package for Dynamic Report Generation in R. R package version 1.33.

SessionInfo

devtools::session_info() %>%
    details::details()

─ Session info ───────────────────────────────────────────────────────────────
 setting  value                       
 version  R version 4.0.4 (2021-02-15)
 os       Ubuntu 16.04.7 LTS          
 system   x86_64, linux-gnu           
 ui       X11                         
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Europe/Zurich               
 date     2021-06-24                  

─ Packages ───────────────────────────────────────────────────────────────────
 package       * version  date       lib source        
 AnnotationDbi   1.52.0   2020-10-27 [1] Bioconductor  
 askpass         1.1      2019-01-13 [1] CRAN (R 4.0.4)
 assertthat      0.2.1    2019-03-21 [1] CRAN (R 4.0.4)
 bayestestR      0.9.0    2021-04-08 [1] CRAN (R 4.0.4)
 Biobase         2.50.0   2020-10-27 [1] Bioconductor  
 BiocFileCache   1.14.0   2020-10-27 [1] Bioconductor  
 BiocGenerics    0.36.1   2021-04-16 [1] Bioconductor  
 biomaRt       * 2.46.3   2021-02-09 [1] Bioconductor  
 bit             4.0.4    2020-08-04 [1] CRAN (R 4.0.4)
 bit64           4.0.5    2020-08-30 [1] CRAN (R 4.0.4)
 blob            1.2.1    2020-01-20 [1] CRAN (R 4.0.4)
 bookdown        0.22     2021-04-22 [1] CRAN (R 4.0.4)
 bslib           0.2.4    2021-01-25 [1] CRAN (R 4.0.4)
 cachem          1.0.4    2021-02-13 [1] CRAN (R 4.0.4)
 callr           3.7.0    2021-04-20 [1] CRAN (R 4.0.4)
 cli             2.5.0    2021-04-26 [1] CRAN (R 4.0.4)
 clipr           0.7.1    2020-10-08 [1] CRAN (R 4.0.4)
 coda            0.19-4   2020-09-30 [1] CRAN (R 4.0.4)
 codetools       0.2-18   2020-11-04 [1] CRAN (R 4.0.4)
 colorspace      2.0-0    2020-11-11 [1] CRAN (R 4.0.4)
 crayon          1.4.1    2021-02-08 [1] CRAN (R 4.0.4)
 curl            4.3      2019-12-02 [1] CRAN (R 4.0.4)
 DBI             1.1.1    2021-01-15 [1] CRAN (R 4.0.4)
 dbplyr          2.1.1    2021-04-06 [1] CRAN (R 4.0.4)
 desc            1.3.0    2021-03-05 [1] CRAN (R 4.0.4)
 details         0.2.1    2020-01-12 [1] CRAN (R 4.0.4)
 devtools        2.4.2    2021-06-07 [1] CRAN (R 4.0.4)
 digest          0.6.27   2020-10-24 [1] CRAN (R 4.0.4)
 dplyr         * 1.0.5    2021-03-05 [1] CRAN (R 4.0.4)
 dupRadar      * 1.20.0   2020-10-27 [1] Bioconductor  
 effectsize      0.4.4-1  2021-04-05 [1] CRAN (R 4.0.4)
 ellipsis        0.3.1    2020-05-15 [1] CRAN (R 4.0.4)
 emmeans         1.6.0    2021-04-24 [1] CRAN (R 4.0.4)
 estimability    1.3      2018-02-11 [1] CRAN (R 4.0.4)
 evaluate        0.14     2019-05-28 [1] CRAN (R 4.0.4)
 fansi           0.4.2    2021-01-15 [1] CRAN (R 4.0.4)
 fastmap         1.1.0    2021-01-25 [1] CRAN (R 4.0.4)
 formatR         1.9      2021-04-14 [1] CRAN (R 4.0.4)
 fs              1.5.0    2020-07-31 [1] CRAN (R 4.0.4)
 generics        0.1.0    2020-10-31 [1] CRAN (R 4.0.4)
 ggplot2         3.3.3    2020-12-30 [1] CRAN (R 4.0.4)
 glue            1.4.2    2020-08-27 [1] CRAN (R 4.0.4)
 gridExtra       2.3      2017-09-09 [1] CRAN (R 4.0.4)
 gtable          0.3.0    2019-03-25 [1] CRAN (R 4.0.4)
 highr           0.9      2021-04-16 [1] CRAN (R 4.0.4)
 hms             1.0.0    2021-01-13 [1] CRAN (R 4.0.4)
 htmltools       0.5.1.1  2021-01-22 [1] CRAN (R 4.0.4)
 httr            1.4.2    2020-07-20 [1] CRAN (R 4.0.4)
 insight         0.13.2   2021-04-01 [1] CRAN (R 4.0.4)
 IRanges         2.24.1   2020-12-12 [1] Bioconductor  
 jquerylib       0.1.4    2021-04-26 [1] CRAN (R 4.0.4)
 jsonlite        1.7.2    2020-12-09 [1] CRAN (R 4.0.4)
 KernSmooth      2.23-18  2020-10-29 [1] CRAN (R 4.0.4)
 knitr         * 1.33     2021-04-24 [1] CRAN (R 4.0.4)
 lattice         0.20-41  2020-04-02 [1] CRAN (R 4.0.4)
 lifecycle       1.0.0    2021-02-15 [1] CRAN (R 4.0.4)
 magrittr        2.0.1    2020-11-17 [1] CRAN (R 4.0.4)
 MASS            7.3-53.1 2021-02-12 [1] CRAN (R 4.0.4)
 Matrix          1.3-2    2021-01-06 [1] CRAN (R 4.0.4)
 memoise         2.0.0    2021-01-26 [1] CRAN (R 4.0.4)
 multcomp        1.4-16   2021-02-08 [1] CRAN (R 4.0.4)
 munsell         0.5.0    2018-06-12 [1] CRAN (R 4.0.4)
 mvtnorm         1.1-1    2020-06-09 [1] CRAN (R 4.0.4)
 openssl         1.4.3    2020-09-18 [1] CRAN (R 4.0.4)
 parameters      0.13.0   2021-04-08 [1] CRAN (R 4.0.4)
 pillar          1.6.0    2021-04-13 [1] CRAN (R 4.0.4)
 pkgbuild        1.2.0    2020-12-15 [1] CRAN (R 4.0.4)
 pkgconfig       2.0.3    2019-09-22 [1] CRAN (R 4.0.4)
 pkgload         1.2.1    2021-04-06 [1] CRAN (R 4.0.4)
 png             0.1-7    2013-12-03 [1] CRAN (R 4.0.4)
 prettyunits     1.1.1    2020-01-24 [1] CRAN (R 4.0.4)
 processx        3.5.1    2021-04-04 [1] CRAN (R 4.0.4)
 progress        1.2.2    2019-05-16 [1] CRAN (R 4.0.4)
 ps              1.6.0    2021-02-28 [1] CRAN (R 4.0.4)
 purrr           0.3.4    2020-04-17 [1] CRAN (R 4.0.4)
 R6              2.5.0    2020-10-28 [1] CRAN (R 4.0.4)
 rappdirs        0.3.3    2021-01-31 [1] CRAN (R 4.0.4)
 Rcpp            1.0.6    2021-01-15 [1] CRAN (R 4.0.4)
 remotes         2.3.0    2021-04-01 [1] CRAN (R 4.0.4)
 report          0.3.0    2021-04-15 [1] CRAN (R 4.0.4)
 rlang           0.4.10   2020-12-30 [1] CRAN (R 4.0.4)
 rmarkdown       2.7      2021-02-19 [1] CRAN (R 4.0.4)
 rmdformats    * 1.0.1    2021-01-13 [1] CRAN (R 4.0.4)
 rprojroot       2.0.2    2020-11-15 [1] CRAN (R 4.0.4)
 RSQLite         2.2.7    2021-04-22 [1] CRAN (R 4.0.4)
 Rsubread        2.4.3    2021-03-16 [1] Bioconductor  
 S4Vectors       0.28.1   2020-12-09 [1] Bioconductor  
 sandwich        3.0-0    2020-10-02 [1] CRAN (R 4.0.4)
 sass            0.3.1    2021-01-24 [1] CRAN (R 4.0.4)
 scales          1.1.1    2020-05-11 [1] CRAN (R 4.0.4)
 sessioninfo     1.1.1    2018-11-05 [1] CRAN (R 4.0.4)
 stringi         1.5.3    2020-09-09 [1] CRAN (R 4.0.4)
 stringr         1.4.0    2019-02-10 [1] CRAN (R 4.0.4)
 survival        3.2-11   2021-04-26 [1] CRAN (R 4.0.4)
 testthat        3.0.2    2021-02-14 [1] CRAN (R 4.0.4)
 TH.data         1.0-10   2019-01-21 [1] CRAN (R 4.0.4)
 tibble          3.1.1    2021-04-18 [1] CRAN (R 4.0.4)
 tidyselect      1.1.0    2020-05-11 [1] CRAN (R 4.0.4)
 usethis         2.0.1    2021-02-10 [1] CRAN (R 4.0.4)
 utf8            1.2.1    2021-03-12 [1] CRAN (R 4.0.4)
 vctrs           0.3.7    2021-03-29 [1] CRAN (R 4.0.4)
 viridis       * 0.6.0    2021-04-15 [1] CRAN (R 4.0.4)
 viridisLite   * 0.4.0    2021-04-13 [1] CRAN (R 4.0.4)
 withr           2.4.2    2021-04-18 [1] CRAN (R 4.0.4)
 xfun            0.24     2021-06-15 [1] CRAN (R 4.0.4)
 XML             3.99-0.6 2021-03-16 [1] CRAN (R 4.0.4)
 xml2            1.3.2    2020-04-23 [1] CRAN (R 4.0.4)
 xtable          1.8-4    2019-04-21 [1] CRAN (R 4.0.4)
 yaml            2.2.1    2020-02-01 [1] CRAN (R 4.0.4)
 zoo             1.8-9    2021-03-09 [1] CRAN (R 4.0.4)

[1] /home/ubuntu/R/x86_64-pc-linux-gnu-library/4.0
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library